# 加载CatBoost包
# if (!require(catboost)) install.packages("catboost")
#devtools::install_github("catboost/catboost", subdir="catboost/R-package")
library(catboost)
library(caret)
library(pROC)
library(MatchIt)
library(ggplot2)
library(extrafont)
library(dplyr)

# 设置工作目录并加载数据
setwd("${path}")
mydata <- read.csv("bibliometric_nuomotu.csv")

cols<-colnames(mydata)[3:38]
independent_and<- paste0(cols,collapse = "+")
independent_dou<- paste0(cols,collapse = "\",\"")

# 确保 group_best 是因子，并且值为 "event_0" 和 "event_1"
mydata$group_best <- factor(mydata$group_best, levels = c("0", "1"), labels = c("event_0", "event_1"))


# 使用 matchit 进行 1:3 匹配
m.out <- matchit(as.formula(paste0("group_best~",independent_and)), 
                 data = mydata, 
                 method = "nearest", 
                 ratio = 3)

# 提取匹配后的数据
matched_data <- match.data(m.out)
# 创建特征矩阵（不包括目标变量）
feature_names <- c(cols)
X <- as.matrix(matched_data[, feature_names])
colnames(matched_data)
# 创建标签向量
y <- as.numeric(matched_data$group_best) - 1

# 创建 CatBoost Pool 对象
train_pool <- catboost.load_pool(data = X, label = y)
# 验证 Pool 对象是否创建成功
if (!inherits(train_pool, "catboost.Pool")) {
  stop("未能正确创建 Pool 对象，请检查数据格式。")
}
# 设置模型参数
params <- list(
  loss_function = "Logloss",
  eval_metric = "AUC",
  iterations = 1000,
  depth = 6,
  learning_rate = 0.1,
  verbose = 0,  # 关闭详细输出，或者设置为正整数控制输出频率
  prediction_type = "RawFormulaVal"  # 指定输出预测值的格式
)

# 训练 CatBoost 模型
model_catboost <- catboost.train(learn_pool = train_pool, params = params)


# 如果有测试集，也可以创建一个测试 Pool 对象
# 注意：这里假设 X_test 是您的测试特征矩阵，y_test 是您的测试标签向量
if (exists("X") && exists("y")) {
  test_pool <- catboost.load_pool(data = as.matrix(mydata[, feature_names]), label = as.numeric(mydata$group_best) - 1)
} else {
  message("测试集未提供，将使用训练集进行预测。")
  test_pool <- train_pool
}

# 预测和评估
predictions_catboost <- catboost.predict(model = model_catboost, pool = test_pool)

# 将预测概率转换为分类结果
predicted_probs_catboost <- predictions_catboost
predicted_classes_catboost <- factor(ifelse(predicted_probs_catboost >= 0.5, "event_1", "event_0"), levels = c("event_0", "event_1"))

# 添加预测结果到数据集
mydata_with_predictions_catboost <- mydata %>%
  mutate(
    group_best_probability = predicted_probs_catboost,
    group_best_predicted_class = predicted_classes_catboost,
    group_best_score = NA  # CatBoost没有评分卡
  )

# 计算 group_best 和 group_best_predicted_class 相同的数据数量
matching_rows <- mydata_with_predictions_catboost %>%
  filter(group_best == group_best_predicted_class) %>%
  nrow()
# 打印结果
cat("Number of rows where group_best and group_best_predicted_class match:", matching_rows, "\n")
# 可选：计算准确率（匹配行数 / 总行数）
total_rows <- nrow(matched_data)
accuracy <- matching_rows / total_rows
# 打印结果
cat("Number of rows where group_best and group_best_predicted_class match:", accuracy, "\n")
# 这是建模集的结果
write.csv(mydata_with_predictions_catboost, file = "mydata_with_predictions.csv", row.names = FALSE)





#### 特征重要性图 ####

if (exists("X") && exists("y")) {
  test_pool <- catboost.load_pool(data = as.matrix(matched_data[, feature_names]), label = as.numeric(matched_data$group_best) - 1)
} else {
  message("测试集未提供，将使用训练集进行预测。")
  test_pool <- train_pool
}

# 预测和评估
predictions_catboost <- catboost.predict(model = model_catboost, pool = test_pool)

# 将预测概率转换为分类结果
predicted_probs_catboost <- predictions_catboost
predicted_classes_catboost <- factor(ifelse(predicted_probs_catboost >= 0.5, "event_1", "event_0"), levels = c("event_0", "event_1"))

# 获取特征重要性
feature_importance <- catboost.get_feature_importance(model_catboost, pool = train_pool)

# 创建数据框用于绘图
imp_df <- data.frame(
  Feature = feature_names,
  Importance = as.numeric(feature_importance)
)

# 按照重要性降序排列并绘制图形
imp_df <- imp_df[order(-imp_df$Importance), ]
p_feature_importance <- ggplot(imp_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "#377eb8", colour = "black") +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    axis.ticks.length = unit(-0.25, "cm"),
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(color = "black"),
    legend.position = "none"
  ) +
  labs(title = "Feature Importance Scores", x = "", y = "Importance Score") +
  annotate("text", x = Inf, y = -Inf, hjust = 1, vjust = -0.5, label = "Source: Your Analysis Methodology", size = 3)
png(file="importanceFeatures_catBoost.png", width=800, height=800)
print(p_feature_importance)
dev.off()
#### 混淆矩阵 ####
library(caret)

# 确保预测结果和真实标签长度一致
if (length(predicted_classes_catboost) != nrow(matched_data)) {
  stop("Predicted classes and actual labels do not have the same length.")
}

# 计算混淆矩阵
conf_matrix <- confusionMatrix(data = factor(predicted_classes_catboost),
                               reference = factor(matched_data$group_best, levels = c("event_0", "event_1")))

# 打印混淆矩阵
# sink("confusionMatrix_catBoost.txt")
# print(conf_matrix)
# sink()

#### ROC 曲线 ####
# 计算并绘制ROC曲线
roc_curve <- roc(matched_data$group_best, predicted_probs_catboost)

# 添加AUC值到图表中
auc_roc <- auc(roc_curve)
# png(file="roc_catBoost.tiff", width=800, height=800,units="in",dpi=150)
png(file="roc_catBoost.png", width=800, height=800)
plot(roc_curve, main = "ROC Curve", col = "#e41a1c", lwd = 2)
legend("bottomright", legend = paste("AUC =", round(auc_roc, 2)), bty = "n")
dev.off()
#### 校准图 ####
library(ggplot2)

predicted_probs_catboost <- as.numeric(predicted_probs_catboost)
actual_labels <- as.numeric(matched_data$group_best) - 1 # 将因子转换为数值型（0和1）

# 创建二元逻辑回归模型以获取预期概率
calib_model <- glm(actual_labels ~ predicted_probs_catboost, family = binomial())

# 预测校准概率
calib_probs <- predict(calib_model, type = "response")

# 绘制校准图
calibration_plot <- data.frame(
  Predicted = predicted_probs_catboost,
  Actual = calib_probs
)

png(file="cal_catBoost.png", width=800, height=800)
ggplot(calibration_plot, aes(x = Predicted, y = Actual)) +
  geom_point(alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  geom_smooth(method = "loess", se = FALSE, color = "#377eb8") +
  theme_minimal(base_family = "Arial", base_size = 12) +
  labs(title = "Calibration Plot", x = "Predicted Probability", y = "Observed Fraction of Positives")
dev.off()

# mydata1 <- read.csv("result.csv")
mydata1 <- read.csv("bibliometric_nuomotuRes.csv")
test_pool <- catboost.load_pool(data = as.matrix(mydata1[, feature_names]), label = as.numeric(mydata1$group_best) - 1)
# 预测和评估
predictions_catboost <- catboost.predict(model = model_catboost, pool = test_pool)

# 将预测概率转换为分类结果
predicted_probs_catboost <- predictions_catboost
predicted_classes_catboost <- factor(ifelse(predicted_probs_catboost >= 0.5, "event_1", "event_0"), levels = c("event_0", "event_1"))

# 添加预测结果到数据集
result_with_predictions_catboost <- mydata1 %>%
  mutate(
    group_best_probability = predicted_probs_catboost,
    group_best_predicted_class = predicted_classes_catboost,
    group_best_score = NA  # CatBoost没有评分卡
  )
# 这是结局的结果
# write.csv(result_with_predictions_catboost, file = "result_with_predictions_catboost.csv", row.names = FALSE)
write.csv(result_with_predictions_catboost, file = "result_with_predictions.csv", row.names = FALSE)
